library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v stringr 1.4.0
## v tidyr   1.1.3     v forcats 0.5.1
## v readr   2.0.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidyr)
library(ggplot2)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(NbClust)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(plotly)
## Warning: package 'plotly' was built under R version 4.1.2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(cluster)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
df=read.csv("E:\\GRAD\\fda\\Project1\\World Indicators.csv",na.strings = "")

df$Energy.Usage=NULL
df$Lending.Interest=NULL
colSums(is.na(df))
##          ï..Birth.Rate      Business.Tax.Rate Days.to.Start.Business 
##                      9                     27                     27 
##                    GDP       Health.Exp...GDP      Health.Exp.Capita 
##                     20                     23                     23 
##        Hours.to.do.Tax  Infant.Mortality.Rate         Internet.Usage 
##                     28                     20                      9 
## Life.Expectancy.Female   Life.Expectancy.Male     Mobile.Phone.Usage 
##                     11                     11                     12 
##        Population.0.14       Population.15.64         Population.65. 
##                     17                     17                     17 
##       Population.Urban                 Region                Country 
##                      2                      0                      0
library(ggplot2)
#Outkier percentage return function
perc_out=function(a){
  length(which(a>  mean(a) + 3 * sd(a) | a < mean(a) - 3 * sd(a))  ) / length(a)
}
names(df)
##  [1] "ï..Birth.Rate"          "Business.Tax.Rate"      "Days.to.Start.Business"
##  [4] "GDP"                    "Health.Exp...GDP"       "Health.Exp.Capita"     
##  [7] "Hours.to.do.Tax"        "Infant.Mortality.Rate"  "Internet.Usage"        
## [10] "Life.Expectancy.Female" "Life.Expectancy.Male"   "Mobile.Phone.Usage"    
## [13] "Population.0.14"        "Population.15.64"       "Population.65."        
## [16] "Population.Urban"       "Region"                 "Country"
#Cleaning Birth rate
boxplot(df$ï..Birth.Rate)

df$ï..Birth.Rate[is.na(df$ï..Birth.Rate)]=mean(df$ï..Birth.Rate,trim=2*perc_out(na.omit(df$ï..Birth.Rate)),na.rm = TRUE)
ggplot(df, aes(x=ï..Birth.Rate)) +  geom_density()

df$ï..Birth.Rate=df$ï..Birth.Rate**0.25
#cleaning Business.Tax.Rate
df$Business.Tax.Rate=gsub(pattern="%",replacement="",df$Business.Tax.Rate)
df$Business.Tax.Rate=as.numeric(df$Business.Tax.Rate)
boxplot(df$Business.Tax.Rate)

df$Business.Tax.Rate[is.na(df$Business.Tax.Rate)]=mean(df$Business.Tax.Rate,trim=2*perc_out(na.omit(df$Business.Tax.Rate)),na.rm = TRUE)
ggplot(df, aes(x=Business.Tax.Rate)) +  geom_density()

df$Business.Tax.Rate=df$Business.Tax.Rate**0.2
#cleaning Days.to.Start.Business


boxplot(df$Days.to.Start.Business)

df$Days.to.Start.Business[is.na(df$Days.to.Start.Business)]=mean(df$Days.to.Start.Business,trim=2*perc_out(na.omit(df$Days.to.Start.Business)),na.rm = TRUE)
ggplot(df, aes(x=Days.to.Start.Business)) +  geom_density()

df$Days.to.Start.Business=df$Days.to.Start.Business**0.2
#Cleaning GDP

df$GDP=gsub(pattern="[[:punct:]]",replacement="",df$GDP)
df$GDP=as.numeric(df$GDP)
boxplot(df$GDP)

df$GDP[is.na(df$GDP)]=mean(df$GDP,trim=2*perc_out(na.omit(df$GDP)),na.rm = TRUE)
ggplot(df, aes(x=GDP)) +  geom_density()

df$GDP=df$GDP**0.1
#cleaning Health.Exp...GDP

boxplot(df$Health.Exp...GDP)

df$Health.Exp...GDP[is.na(df$Health.Exp...GDP)]=mean(df$Health.Exp...GDP,trim=2*perc_out(na.omit(df$Health.Exp...GDP)),na.rm = TRUE)
ggplot(df, aes(x=Health.Exp...GDP)) +  geom_density()

df$Health.Exp...GDP=df$Health.Exp...GDP**0.8
#Cleaning Health.Exp.Capita

df$Health.Exp.Capita=gsub(pattern="[[:punct:]]",replacement="",df$Health.Exp.Capita)
df$Health.Exp.Capita=as.numeric(df$Health.Exp.Capita)
boxplot(df$Health.Exp.Capita)

df$Health.Exp.Capita[is.na(df$Health.Exp.Capita)]=mean(df$Health.Exp.Capita,trim=2*perc_out(na.omit(df$Health.Exp.Capita)),na.rm = TRUE)
ggplot(df, aes(x=Health.Exp.Capita)) +  geom_density()

df$Health.Exp.Capita=df$Health.Exp.Capita**0.2
#Cleaning Hours.to.do.Tax

boxplot(df$Hours.to.do.Tax)

df$Hours.to.do.Tax[is.na(df$Hours.to.do.Tax)]=mean(df$Hours.to.do.Tax,trim=2*perc_out(na.omit(df$Hours.to.do.Tax)),na.rm = TRUE)
ggplot(df, aes(x=Hours.to.do.Tax)) +  geom_density()

df$Hours.to.do.Tax=df$Hours.to.do.Tax**0.2
boxplot(df$Infant.Mortality.Rate)

df$Infant.Mortality.Rate[is.na(df$Infant.Mortality.Rate)]=mean(df$Infant.Mortality.Rate,trim=2*perc_out(na.omit(df$Infant.Mortality.Rate)),na.rm = TRUE)
ggplot(df, aes(x=Infant.Mortality.Rate)) +  geom_density()

df$Infant.Mortality.Rate=df$Infant.Mortality.Rate**0.33
boxplot(df$Internet.Usage)

df$Internet.Usage[is.na(df$Internet.Usage)]=mean(df$Internet.Usage,trim=2*perc_out(na.omit(df$Internet.Usage)),na.rm = TRUE)

ggplot(df, aes(x=Internet.Usage)) +  geom_density()

boxplot(df$Life.Expectancy.Female)

df$Life.Expectancy.Female[is.na(df$Life.Expectancy.Female)]=mean(df$Life.Expectancy.Female,trim=2*perc_out(na.omit(df$Life.Expectancy.Female)),na.rm = TRUE)

ggplot(df, aes(x=Life.Expectancy.Female)) +  geom_density()

df$Life.Expectancy.Female=df$Life.Expectancy.Female**3
df$Life.Expectancy.Female=df$Life.Expectancy.Female**2
boxplot(df$Life.Expectancy.Male)

df$Life.Expectancy.Male[is.na(df$Life.Expectancy.Male)]=mean(df$Life.Expectancy.Male,trim=2*perc_out(na.omit(df$Life.Expectancy.Male)),na.rm = TRUE)
ggplot(df, aes(x=Life.Expectancy.Male)) +  geom_density()

df$Life.Expectancy.Male=df$Life.Expectancy.Male**2
boxplot(df$Mobile.Phone.Usage)

df$Mobile.Phone.Usage[is.na(df$Mobile.Phone.Usage)]=mean(df$Mobile.Phone.Usage,trim=2*perc_out(na.omit(df$Mobile.Phone.Usage)),na.rm = TRUE)
ggplot(df, aes(x=Mobile.Phone.Usage)) +  geom_density()

boxplot(df$Population.0.14)

df$Population.0.14[is.na(df$Population.0.14)]=mean(df$Population.0.14,trim=2*perc_out(na.omit(df$Population.0.14)),na.rm = TRUE)
ggplot(df, aes(x=Population.0.14)) +  geom_density()

boxplot(df$Population.15.64)

df$Population.15.64[is.na(df$Population.15.64)]=mean(df$Population.15.64,trim=2*perc_out(na.omit(df$Population.15.64)),na.rm = TRUE)
ggplot(df, aes(x=Population.15.64)) +  geom_density()

boxplot(df$Population.65.)

df$Population.65.[is.na(df$Population.65.)]=mean(df$Population.65.,trim=2*perc_out(na.omit(df$Population.65.)),na.rm = TRUE)
ggplot(df, aes(x=Population.65.)) +  geom_density()

df$Population.65.=df$Population.65.**0.33
boxplot(df$Population.Urban)

df$Population.Urban[is.na(df$Population.Urban)]=mean(df$Population.Urban,trim=2*perc_out(na.omit(df$Population.Urban)),na.rm = TRUE)
ggplot(df, aes(x=Population.Urban)) +  geom_density()

x=data.frame(scale(df[,1:16]))
y=df[,18]
#Finding optimal number of clusters
fviz_nbclust(x,kmeans,method="wss")

fviz_nbclust(df[,1:3],kmeans,method="silhouette")

#apply Kmeans
km=kmeans(x,2,nstart=20)
df$class_k=km$cluster

#Apply heirarchical clustering
hc.single=hclust(dist(x),method="centroid")
plot(hc.single)

df$class_h=cutree(hc.single,7)
sil_k <- silhouette(df$class_k, dist(x))
plot(sil_k, main ="Silhouette plot - K-means")

sil_h<- silhouette(df$class_h, dist(x))
plot(sil_h, main ="Silhouette plot - hierarchical-means")

#Filling summary
compare=data.frame(matrix(ncol = 3, nrow = 0))
names(compare)=c("Parameter","Kmeans","hierarchical")

compare=rbind(compare,c("Silhoutte",mean(sil_k[,3]),mean(sil_h[,3])))
#Find Dunn index to make sure they have been clustered well
library(fpc)
k_stats <- cluster.stats(dist(x),  df$class_k)
h_stats <- cluster.stats(dist(x),  df$class_h)

compare=rbind(compare,c("Dunn index",k_stats$dunn,h_stats$dunn))
names(compare)=c("Parameter","Kmeans","hierarchical")

print(compare)
##    Parameter            Kmeans        hierarchical
## 1  Silhoutte 0.335333571169999 -0.0297076394297334
## 2 Dunn index 0.154051727097344   0.133924939621907
#It is clear that Kmeans performed well tha Heirarchical method since silhoutte index and dunn index is more than the other
#Comparing Health expense capita and GDP
plot_ly(df, x = ~Health.Exp.Capita, y = ~GDP,  color = ~as.factor(class_k))
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
#From the graph it is clear that countries lying on 1st and 4th quarants are considered as a group 
#Comparing Infant.Mortality.Rate and GDP
plot_ly(df, x = ~Infant.Mortality.Rate, y = ~GDP,  color = ~as.factor(class_k))
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
#From the graph it is clear that countries lying on 1st and 4th quarants are considered as a group 
#Comparing Health.Exp.Capita ,ï..Birth.Rate,Health.Exp...GDP

plot_ly(df, x = ~Health.Exp...GDP, y = ~Health.Exp.Capita, z=df$ï..Birth.Rate, color = ~as.factor(class_k))
## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
#From the graph it is clear that countries lying on 1st and 4th quarants are considered as a group
names(df)
##  [1] "ï..Birth.Rate"          "Business.Tax.Rate"      "Days.to.Start.Business"
##  [4] "GDP"                    "Health.Exp...GDP"       "Health.Exp.Capita"     
##  [7] "Hours.to.do.Tax"        "Infant.Mortality.Rate"  "Internet.Usage"        
## [10] "Life.Expectancy.Female" "Life.Expectancy.Male"   "Mobile.Phone.Usage"    
## [13] "Population.0.14"        "Population.15.64"       "Population.65."        
## [16] "Population.Urban"       "Region"                 "Country"               
## [19] "class_k"                "class_h"

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.